Customer records and SQL queries

Author: Chisheng Li

This project quickly conduct the appropriate SQL queries from 2 input XML files customer.xml and nation.xml. Create the tables in the database from the XML files and perform retrival operations on the tables that were created.

Output: customer.db, desiredCustomers.txt, marketSegments.txt and nationsBalance.txt.


In [1]:
import xml.etree.ElementTree as ET
import sqlite3 as sqlite
import sys

In [2]:
def check_result(result, desired):
    if result != desired:
        print "NOT_OK.  Expected " + desired + "; but got "+result
    else:
        print "OK!"
    return

In [3]:
def read_xml_file(xml_filename, field_types):
    tree = ET.parse(xml_filename)
    root = tree.getroot()
    lot = []
    for child in root:
        fields = []
        for gc in child:
            fields.append(gc.text)

        field_list = []
        count = 0
        for f in fields:
            t = field_types[count]
            if t == 'int':
                field_list.append(int(f))
            elif t == 'string':
                field_list.append(f)
            elif t == 'float':
                field_list.append(float(f))
            else:
                field_list.append(f)
            count = count + 1
        
        lot.append(tuple(field_list))
    return lot;

The format of the customer.xml file records:

  <T>
    <C_CUSTKEY>1</C_CUSTKEY>
    <C_NAME>Customer#000000001</C_NAME>
    <C_ADDRESS>IVhzIApeRb ot,c,E</C_ADDRESS>
    <C_NATIONKEY>15</C_NATIONKEY>
    <C_PHONE>25-989-741-2988</C_PHONE>
    <C_ACCTBAL>711.56</C_ACCTBAL>
    <C_MKTSEGMENT>BUILDING</C_MKTSEGMENT>
    <C_COMMENT>regular, regular platelets are fluffily according to the even attainments. blithely iron</C_COMMENT>
  </T>

In [10]:
lot = read_xml_file(r'customer.xml',
                    ['int', 'string', 'string', 'int', 'string', 'float', 'string', 'string'])

with sqlite.connect(r'customer.db') as con: 
    cur = con.cursor()
    # Write a SQL query to erase any existing "customer" table in the database
    cur.execute("drop table if exists customer;")
    
    # Write a SQL query to create a new "customer" table with the following columns:
    # C_CUSTKEY (integer)  C_NAME (text)  C_ADDRESS (text)  C_NATIONKEY (integer)  C_PHONE (text)  C_ACCTBAL (real)  C_MKTSEGMENT (text)  C_COMMENT (text)
    cur.execute("create table customer(C_CUSTKEY integer, C_NAME text, C_ADDRESS text, C_NATIONKEY integer, C_PHONE text, C_ACCTBAL real, C_MKTSEGMENT text, C_COMMENT text)")
    
    # Write a SQL query to insert the values in the list-of-tuples variable lot into the customer table
    cur.executemany("insert into customer values(?,?,?,?,?,?,?,?)", lot)
    con.commit()
    
    # Write a SQL query to compute the count of all records (rows) in the customer table
    cur.execute("select Count(*) from customer;")
    print "customer count: " 
    check_result(str(cur.fetchone()[0]), "1500")
    
    # Write a SQL query to compute the average account balance (C_ACCTBAL) over all rows in the customer table.
    cur.execute("select avg(C_ACCTBAL) from customer;")
    print "customer average: "
    check_result(str(cur.fetchone()[0]), "4454.57706")
    
    # Write a SQL query to get the C_CUSTKEY, C_NAME, and C_ACCTBAL columns from the customer table that
    # have C_ACCTBAL >= 1000, and sort the results by C_CUSTKEY
    cur.execute("select C_CUSTKEY, C_NAME, C_ACCTBAL from customer where (C_ACCTBAL >= '1000') order by C_CUSTKEY; ")
    
    # fetchall() method gets all records. It returns a list of tuples.
    # Each of the tuples represent a row in the table.
    rows = cur.fetchall()
    
    f = open(r'desiredCustomers.txt', 'w')
    f.write('Customer Key' + '\t' + 'Name' + '\t' + 'Account Balance\n')
    for row in rows:
        f.write('\t'.join([str(x) for x in row]) + '\n')
    f.close()
    
    # Write a SQL query to sum the C_ACCTBAL field by market segment C_MKTSEGMENT (call this total column T_ACCTBAL) and sorted by descending T_ACCTBAL
    # Output columns should be C_MKTSEGMENT, and the sum over C_ACCTBAL (sorted by descending amount)
    cur.execute("select C_MKTSEGMENT, sum(C_ACCTBAL) as T_ACCTBAL from customer group by C_MKTSEGMENT order by T_ACCTBAL desc; ")
    # fetchall() method gets all records. It returns a list of tuples.
    # Each of the tuples represent a row in the table.
    rows = cur.fetchall()
    
    f = open(r'marketSegments.txt', 'w')
    f.write('Market Segment' + '\t' + 'Total Customer Account Balance\n')
    for row in rows:
        f.write('\t'.join([str(x) for x in row]) + '\n')
    f.close()


customer count: 
OK!
customer average: 
OK!

The format of the nation.xml file records:

  <T>
      <N_NATIONKEY>0</N_NATIONKEY>
      <N_NAME>ALGERIA</N_NAME>
      <N_REGIONKEY>0</N_REGIONKEY>
      <N_COMMENT>final accounts wake quickly. special request</N_COMMENT>
  </T>

In [6]:
lot = read_xml_file(r'nation.xml', ['int', 'string', 'int', 'string'])

with sqlite.connect(r'customer.db') as con: 
    cur = con.cursor()
    # Write a SQL query to erase any existing "nation" table in the database
    cur.execute("drop table if exists nation;")
    
    # Write a SQL query to create a new "nation" table with the columns:  N_NATIONKEY (integer)  N_NAME (string)  N_REGIONKEY (integer)  C_COMMENT (text)
    cur.execute("create table nation(N_NATIONKEY integer, N_NAME string, N_REGIONKEY integer, C_COMMENT text)")
    
    # Write a single SQL query and cursor command here to insert all the tuples into the "nation" table, using the list-of-tuples variable "lot"
    cur.executemany("insert into nation values(?,?,?,?)", lot)
    con.commit()
    
    # Write a SQL query to compute the total account balance (C_ACCTBAL) in the customer table for each nation.  The output columns
    # should be N_NAME (country name) from the "nation" table and the total account balance, i.e. the sum of C_ACCTBAL in the "customer" table
    # for records grouped by country.  The result should be sorted alphabetically by country name.
    cur.execute("select nation.N_NAME, sum(customer.C_ACCTBAL) from nation, customer where customer.C_NATIONKEY=nation.N_NATIONKEY group by nation.N_NAME order by nation.N_NAME")
    # Each of the tuples represent a row in the table.
    rows = cur.fetchall()
    
    f = open(r'nationsBalance.txt', 'w')
    f.write('Nation Name' + '\t' + 'Total Customer Account Balance\n')
    for row in rows:
        f.write('\t'.join([str(x) for x in row]) + '\n')
    f.close()

In [ ]: